library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)
op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)
About Dataset This dataset is a copy of another Kaggle dataset which can be accessed here: https://www.kaggle.com/c/glioma-radiomics The difference is that I have provided ground truth for the test set (test_GT.csv).
The suffix “omics” in Medical Science is associated with analysis of big sets of features (e.g. Genomics, Proteomics). Radiomics are imaging features (e.g., first order and second order) extracted from Regions of Interest (ROI) in radiology images based on predefined functions and filters. Low grade gliomas (LGG) are a type of brain tumors. Astrocytes and Oligodendrocytes which are two types of brain cells, are considered as origins of LGG. Adult LGG are characterized by different mutations which is important to be correctly identified. With this dataset the goal is to determine if an ROI has 1p19q codeletion (Mutacion=1) or not(Mutacion=0). This plays a key role in predicting patient’s response to chemotherapy and their survival. The dataset provides 640 different radiomics features for each ROI. There are 105 ROIs in the training set and 45 ROIs in the test cohort.
https://www.kaggle.com/datasets/knamdar/radiomics-for-lgg-dataset?select=test_GT.csv
LGG_Data <- read.csv("~/GitHub/LatentBiomarkers/Data/LGG/train.csv")
LGG_DataTest <- read.csv("~/GitHub/LatentBiomarkers/Data/LGG/test.csv")
LGG_TestGT <- read.csv("~/GitHub/LatentBiomarkers/Data/LGG/test_GT.csv")
LGG_DataTest$Mutacion <- LGG_TestGT$Mutacion
LGG_Data <- rbind(LGG_Data,LGG_DataTest)
rownames(LGG_Data) <- LGG_Data$patientID
LGG_Data$patientID <- NULL
pander::pander(table(LGG_Data$Mutacion))
| 0 | 1 |
|---|---|
| 54 | 96 |
studyName <- "LGG"
dataframe <- LGG_Data
outcome <- "Mutacion"
TopVariables <- 10
thro <- 0.80
cexheat = 0.15
Some libraries
library(psych)
library(whitening)
library("vioplot")
library("rpart")
pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
| rows | col |
|---|---|
| 150 | 640 |
pander::pander(table(dataframe[,outcome]))
| 0 | 1 |
|---|---|
| 54 | 96 |
varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]
largeSet <- length(varlist) > 1500
Scaling and removing near zero variance columns and highly co-linear(r>0.99999) columns
### Some global cleaning
sdiszero <- apply(dataframe,2,sd) > 1.0e-16
dataframe <- dataframe[,sdiszero]
varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
dataframe <- dataframe[,tokeep]
varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]
iscontinous <- sapply(apply(dataframe,2,unique),length) >= 5 ## Only variables with enough samples
dataframeScaled <- FRESAScale(dataframe,method="OrderLogit")$scaledData
numsub <- nrow(dataframe)
if (numsub > 1000) numsub <- 1000
if (!largeSet)
{
hm <- heatMaps(data=dataframeScaled[1:numsub,],
Outcome=outcome,
Scale=TRUE,
hCluster = "row",
xlab="Feature",
ylab="Sample",
srtCol=45,
srtRow=45,
cexCol=cexheat,
cexRow=cexheat
)
par(op)
}
The heat map of the data
if (!largeSet)
{
par(cex=0.6,cex.main=0.85,cex.axis=0.7)
#cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
cormat <- cor(dataframe[,varlist],method="pearson")
cormat[is.na(cormat)] <- 0
gplots::heatmap.2(abs(cormat),
trace = "none",
# scale = "row",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Original Correlation",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="|Pearson Correlation|",
xlab="Feature", ylab="Feature")
diag(cormat) <- 0
print(max(abs(cormat)))
}
[1]
0.9999991
DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#>
#> wavelet.HHH_glcm_DifferenceEntropy wavelet.HLH_gldm_DependenceNonUniformity original_glszm_GrayLevelNonUniformity wavelet.HLH_glszm_LargeAreaHighGrayLevelEmphasis wavelet.HHH_glszm_GrayLevelNonUniformityNormalized
#> original_firstorder_10Percentile original_firstorder_90Percentile
#> 0.20853081 0.23380727
#> original_firstorder_Energy original_firstorder_Entropy
#> 0.36966825 0.35071090
#> original_firstorder_InterquartileRange original_firstorder_Kurtosis
#> 0.38862559 0.03159558
#>
#> Included: 633 , Uni p: 0.0002369668 , Base Size: 5 , Rcrit: 0.281919
#>
#>
1 <R=1.000,thr=0.950>, Top: 83< 9 >[Fa= 83 ]( 83 , 288 , 0 ),<|><>Tot Used: 371 , Added: 288 , Zero Std: 0 , Max Cor: 1.000
#>
2 <R=1.000,thr=0.950>, Top: 51< 1 >[Fa= 133 ]( 51 , 111 , 83 ),<|><>Tot Used: 436 , Added: 111 , Zero Std: 0 , Max Cor: 0.999
#>
3 <R=0.999,thr=0.950>, Top: 11< 11 >[Fa= 144 ]( 11 , 23 , 133 ),<|><>Tot Used: 452 , Added: 23 , Zero Std: 0 , Max Cor: 0.967
#>
4 <R=0.967,thr=0.950>, Top: 2< 1 >[Fa= 146 ]( 2 , 2 , 144 ),<|><>Tot Used: 452 , Added: 2 , Zero Std: 0 , Max Cor: 0.949
#>
5 <R=0.949,thr=0.900>, Top: 62< 1 >[Fa= 167 ]( 61 , 88 , 146 ),<|><>Tot Used: 490 , Added: 88 , Zero Std: 0 , Max Cor: 1.000
#>
6 <R=1.000,thr=0.950>, Top: 7< 1 >[Fa= 169 ]( 7 , 7 , 167 ),<|><>Tot Used: 490 , Added: 7 , Zero Std: 0 , Max Cor: 0.944
#>
7 <R=0.944,thr=0.900>, Top: 12< 1 >[Fa= 171 ]( 11 , 15 , 169 ),<|><>Tot Used: 496 , Added: 15 , Zero Std: 0 , Max Cor: 1.000
#>
8 <R=1.000,thr=0.950>, Top: 1< 1 >[Fa= 171 ]( 1 , 1 , 171 ),<|><>Tot Used: 496 , Added: 1 , Zero Std: 0 , Max Cor: 0.935
#>
9 <R=0.935,thr=0.900>, Top: 1< 1 >[Fa= 171 ]( 1 , 1 , 171 ),<|><>Tot Used: 496 , Added: 1 , Zero Std: 0 , Max Cor: 0.996
#>
10 <R=0.996,thr=0.950>, Top: 1< 1 >[Fa= 172 ]( 1 , 1 , 171 ),<|><>Tot Used: 496 , Added: 1 , Zero Std: 0 , Max Cor: 0.996
#>
11 <R=0.996,thr=0.950>, Top: 1< 1 >[Fa= 172 ]( 1 , 1 , 172 ),<|><>Tot Used: 496 , Added: 1 , Zero Std: 0 , Max Cor: 0.996
#>
12 <R=0.996,thr=0.950>, Top: 1< 1 >[Fa= 172 ]( 1 , 1 , 172 ),<|><>Tot Used: 496 , Added: 1 , Zero Std: 0 , Max Cor: 0.996
#>
13 <R=0.996,thr=0.950>, Top: 1< 1 >[Fa= 172 ]( 1 , 1 , 172 ),<|><>Tot Used: 496 , Added: 1 , Zero Std: 0 , Max Cor: 0.996
#>
14 <R=0.996,thr=0.950>, Top: 1< 1 >[Fa= 172 ]( 1 , 1 , 172 ),<|><>Tot Used: 496 , Added: 1 , Zero Std: 0 , Max Cor: 0.996
#>
15 <R=0.996,thr=0.950>, Top: 1< 1 >[Fa= 172 ]( 1 , 1 , 172 ),<|><>Tot Used: 496 , Added: 1 , Zero Std: 0 , Max Cor: 0.996
#>
16 <R=0.996,thr=0.950>, Top: 1< 1 >[Fa= 172 ]( 1 , 1 , 172 ),<|><>Tot Used: 496 , Added: 1 , Zero Std: 0 , Max Cor: 0.996
#>
17 <R=0.996,thr=0.950>, Top: 1< 1 >[Fa= 172 ]( 1 , 1 , 172 ),<|><>Tot Used: 496 , Added: 1 , Zero Std: 0 , Max Cor: 0.996
#>
18 <R=0.996,thr=0.950>, Top: 1< 1 >[Fa= 172 ]( 1 , 1 , 172 ),<|><>Tot Used: 496 , Added: 1 , Zero Std: 0 , Max Cor: 0.996
#>
19 <R=0.996,thr=0.950>, Top: 1< 1 >[Fa= 172 ]( 1 , 1 , 172 ),<|><>Tot Used: 496 , Added: 1 , Zero Std: 0 , Max Cor: 0.996
#>
20 <R=0.996,thr=0.950>, Top: 1< 1 >[Fa= 172 ]( 1 , 1 , 172 ),<|><>Tot Used: 496 , Added: 1 , Zero Std: 0 , Max Cor: 0.996
#>
21 <R=0.996,thr=0.950>, Top: 1< 1 >[Fa= 172 ]( 1 , 1 , 172 ),<|><>Tot Used: 496 , Added: 1 , Zero Std: 0 , Max Cor: 0.994
#>
22 <R=0.994,thr=0.950>, Top: 1< 1 >[Fa= 172 ]( 1 , 1 , 172 ),<|><>Tot Used: 496 , Added: 1 , Zero Std: 0 , Max Cor: 0.899
#>
23 <R=0.899,thr=0.800>, Top: 81< 1 >[Fa= 200 ]( 76 , 102 , 172 ),<|><>Tot Used: 531 , Added: 102 , Zero Std: 0 , Max Cor: 0.997
#>
24 <R=0.997,thr=0.950>, Top: 9< 1 >[Fa= 206 ]( 9 , 9 , 200 ),<|><>Tot Used: 533 , Added: 9 , Zero Std: 0 , Max Cor: 1.000
#>
25 <R=1.000,thr=0.950>, Top: 2< 1 >[Fa= 207 ]( 2 , 2 , 206 ),<|><>Tot Used: 533 , Added: 2 , Zero Std: 0 , Max Cor: 0.980
#>
26 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#>
27 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#>
28 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#>
29 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#>
30 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#>
31 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#>
32 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#>
33 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#>
34 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#>
35 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#>
36 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#>
37 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#>
38 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#>
39 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#>
40 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#>
41 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#>
42 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.975
#>
43 <R=0.975,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.949
#>
44 <R=0.949,thr=0.900>, Top: 6< 1 >[Fa= 208 ]( 6 , 6 , 207 ),<|><>Tot Used: 533 , Added: 6 , Zero Std: 0 , Max Cor: 0.899
#>
45 <R=0.899,thr=0.800>, Top: 25< 1 >[Fa= 217 ]( 22 , 30 , 208 ),<|><>Tot Used: 548 , Added: 30 , Zero Std: 0 , Max Cor: 0.959
#>
46 <R=0.959,thr=0.950>, Top: 1< 1 >[Fa= 217 ]( 1 , 1 , 217 ),<|><>Tot Used: 548 , Added: 1 , Zero Std: 0 , Max Cor: 0.893
#>
47 <R=0.893,thr=0.800>, Top: 11< 1 >[Fa= 221 ]( 11 , 18 , 217 ),<|><>Tot Used: 556 , Added: 18 , Zero Std: 0 , Max Cor: 0.866
#>
48 <R=0.866,thr=0.800>, Top: 4< 1 >[Fa= 221 ]( 4 , 4 , 221 ),<|><>Tot Used: 556 , Added: 4 , Zero Std: 0 , Max Cor: 0.839
#>
49 <R=0.839,thr=0.800>, Top: 1< 1 >[Fa= 222 ]( 1 , 1 , 221 ),<|><>Tot Used: 556 , Added: 1 , Zero Std: 0 , Max Cor: 0.800
#>
50 <R=0.800,thr=0.800>
#>
[ 50 ], 0.7995444 Decor Dimension: 556 Nused: 556 . Cor to Base: 312 , ABase: 633 , Outcome Base: 0
#>
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]
pander::pander(sum(apply(dataframe[,varlist],2,var)))
6.77e+22
pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))
6.77e+22
pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))
0.000131
pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))
0.000131
varratio <- attr(DEdataframe,"VarRatio")
pander::pander(tail(varratio))
| La_wavelet.HHL_glcm_ClusterTendency | La_wavelet.HHH_glcm_ClusterTendency | La_wavelet.LHH_glcm_Contrast | La_wavelet.LHH_firstorder_Maximum | La_wavelet.HHH_firstorder_Maximum | La_wavelet.HLH_firstorder_Maximum |
|---|---|---|---|---|---|
| 3.54e-30 | 2.21e-30 | 1.58e-30 | 1.16e-30 | 9.35e-31 | 7.84e-31 |
if (!largeSet)
{
par(cex=0.6,cex.main=0.85,cex.axis=0.7)
UPLTM <- attr(DEdataframe,"UPLTM")
gplots::heatmap.2(1.0*(abs(UPLTM)>0),
trace = "none",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Decorrelation matrix",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="|Beta|>0",
xlab="Output Feature", ylab="Input Feature")
par(op)
}
Displaying the features associations
par(op)
#if ((ncol(dataframe) < 1000) && (ncol(dataframe) > 10))
#{
# DEdataframeB <- ILAA(dataframe,verbose=TRUE,thr=thro,bootstrap=30)
transform <- attr(DEdataframe,"UPLTM") != 0
tnames <- colnames(transform)
colnames(transform) <- str_remove_all(colnames(transform),"La_")
transform <- abs(transform*cor(dataframe[,rownames(transform)])) # The weights are proportional to the observed correlation
VertexSize <- attr(DEdataframe,"fscore") # The size depends on the variable independence relevance (fscore)
names(VertexSize) <- str_remove_all(names(VertexSize),"La_")
VertexSize <- 10*(VertexSize-min(VertexSize))/(max(VertexSize)-min(VertexSize)) # Normalization
VertexSize <- VertexSize[rownames(transform)]
rsum <- apply(1*(transform !=0),1,sum) + 0.01*VertexSize + 0.001*varratio[tnames]
csum <- apply(1*(transform !=0),2,sum) + 0.01*VertexSize + 0.001*varratio[tnames]
ntop <- min(10,length(rsum))
topfeatures <- unique(c(names(rsum[order(-rsum)])[1:ntop],names(csum[order(-csum)])[1:ntop]))
rtrans <- transform[topfeatures,]
csum <- (apply(1*(rtrans !=0),2,sum) > 1)
rtrans <- rtrans[,csum]
topfeatures <- unique(c(topfeatures,colnames(rtrans)))
print(ncol(transform))
#> [1] 556
transform <- transform[topfeatures,topfeatures]
print(ncol(transform))
#> [1] 22
if (ncol(transform)>100)
{
csum <- (apply(1*(transform !=0),2,sum) > 1) & (apply(1*(transform !=0),1,sum) > 1)
transform <- transform[csum,csum]
print(ncol(transform))
}
if (ncol(transform) < 150)
{
gplots::heatmap.2(transform,
trace = "none",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Red Decorrelation matrix",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="|Beta|>0",
xlab="Output Feature", ylab="Input Feature")
par(op)
VertexSize <- VertexSize[colnames(transform)]
gr <- graph_from_adjacency_matrix(transform,mode = "directed",diag = FALSE,weighted=TRUE)
gr$layout <- layout_with_fr
fc <- cluster_optimal(gr)
plot(fc, gr,
edge.width = 2*E(gr)$weight,
vertex.size=VertexSize,
edge.arrow.size=0.5,
edge.arrow.width=0.5,
vertex.label.cex=(0.15+0.05*VertexSize),
vertex.label.dist=0.5 + 0.05*VertexSize,
main="Top Feature Association")
}
par(op)
if (!largeSet)
{
cormat <- cor(DEdataframe[,varlistc],method="pearson")
cormat[is.na(cormat)] <- 0
gplots::heatmap.2(abs(cormat),
trace = "none",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Correlation after ILAA",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="|Pearson Correlation|",
xlab="Feature", ylab="Feature")
par(op)
diag(cormat) <- 0
print(max(abs(cormat)))
}
[1]
0.7995444
classes <- unique(dataframe[1:numsub,outcome])
raincolors <- rainbow(length(classes))
names(raincolors) <- classes
topvars <- univariate_BinEnsemble(dataframe,outcome)
lso <- LASSO_MIN(formula(paste(outcome,"~.")),dataframe,family="binomial")
topvars <- unique(c(names(topvars),lso$selectedfeatures))
pander::pander(head(topvars))
original_firstorder_Skewness, original_glcm_ClusterShade, original_gldm_LargeDependenceHighGrayLevelEmphasis, wavelet.HLL_glcm_ClusterShade, original_firstorder_Median and original_glcm_MaximumProbability
# names(topvars)
#if (nrow(dataframe) < 1000)
#{
datasetframe.umap = umap(scale(dataframe[1:numsub,topvars]),n_components=2)
# datasetframe.umap = umap(dataframe[1:numsub,varlist],n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
text(datasetframe.umap$layout,labels=dataframe[1:numsub,outcome],col=raincolors[dataframe[1:numsub,outcome]+1])
#}
varlistcV <- names(varratio[varratio >= 0.025])
topvars <- univariate_BinEnsemble(DEdataframe[,varlistcV],outcome)
lso <- LASSO_MIN(formula(paste(outcome,"~.")),DEdataframe,family="binomial")
topvars <- unique(c(names(topvars),lso$selectedfeatures))
pander::pander(head(topvars))
La_original_glcm_Idmn, original_firstorder_Skewness, original_glcm_ClusterShade, original_gldm_LargeDependenceHighGrayLevelEmphasis, La_original_glcm_Contrast and wavelet.HLL_glcm_ClusterShade
varlistcV <- varlistcV[varlistcV != outcome]
# DEdataframe[,outcome] <- as.numeric(DEdataframe[,outcome])
#if (nrow(dataframe) < 1000)
#{
datasetframe.umap = umap(scale(DEdataframe[1:numsub,topvars]),n_components=2)
# datasetframe.umap = umap(DEdataframe[1:numsub,varlistcV],n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After ILAA",t='n')
text(datasetframe.umap$layout,labels=DEdataframe[1:numsub,outcome],col=raincolors[DEdataframe[1:numsub,outcome]+1])
#}
univarRAW <- uniRankVar(varlist,
paste(outcome,"~1"),
outcome,
dataframe,
rankingTest="AUC")
100 : wavelet.HHH_firstorder_10Percentile 200 :
wavelet.HHL_firstorder_Uniformity 300 : wavelet.HLH_glcm_Imc1 400 :
wavelet.HLL_gldm_GrayLevelVariance 500 :
wavelet.LHH_glrlm_LongRunLowGrayLevelEmphasis
600 : wavelet.LHL_glszm_LargeAreaHighGrayLevelEmphasis
univarDe <- uniRankVar(varlistc,
paste(outcome,"~1"),
outcome,
DEdataframe,
rankingTest="AUC",
)
100 : La_wavelet.HHH_firstorder_10Percentile 200 :
La_wavelet.HHL_firstorder_Uniformity 300 : La_wavelet.HLH_glcm_Imc1 400
: La_wavelet.HLL_gldm_GrayLevelVariance 500 :
La_wavelet.LHH_glrlm_LongRunLowGrayLevelEmphasis
600 : La_wavelet.LHL_glszm_LargeAreaHighGrayLevelEmphasis
univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")
##top variables
topvar <- c(1:length(varlist)) <= TopVariables
tableRaw <- univarRAW$orderframe[topvar,univariate_columns]
pander::pander(tableRaw)
| caseMean | caseStd | controlMean | controlStd | controlKSP | ROCAUC | |
|---|---|---|---|---|---|---|
| original_firstorder_Skewness | -3.93e-02 | 5.28e-01 | -6.84e-01 | 6.34e-01 | 9.78e-01 | 0.780 |
| original_glcm_ClusterShade | 7.42e+02 | 1.02e+04 | -1.06e+04 | 1.65e+04 | 4.41e-02 | 0.755 |
| wavelet.HLL_glcm_ClusterShade | -7.99e+02 | 3.12e+03 | 2.32e+03 | 4.29e+03 | 4.72e-02 | 0.737 |
| wavelet.HLL_firstorder_Skewness | -1.05e-02 | 5.50e-01 | 5.21e-01 | 8.55e-01 | 5.54e-02 | 0.736 |
| original_firstorder_Median | 2.77e+02 | 5.52e+01 | 3.33e+02 | 8.63e+01 | 6.29e-01 | 0.732 |
| original_glcm_MaximumProbability | 6.19e-03 | 2.32e-03 | 1.02e-02 | 6.26e-03 | 7.10e-02 | 0.722 |
| original_gldm_LargeDependenceHighGrayLevelEmphasis | 1.36e+04 | 9.87e+03 | 3.19e+04 | 3.44e+04 | 2.54e-03 | 0.711 |
| original_glcm_JointEnergy | 1.72e-03 | 7.60e-04 | 2.89e-03 | 2.23e-03 | 1.21e-02 | 0.710 |
| original_firstorder_Mean | 2.75e+02 | 5.33e+01 | 3.20e+02 | 7.86e+01 | 3.40e-01 | 0.702 |
| original_glszm_LargeAreaHighGrayLevelEmphasis | 2.34e+04 | 4.63e+04 | 1.77e+05 | 7.31e+05 | 1.50e-08 | 0.700 |
topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]
pander::pander(finalTable)
| caseMean | caseStd | controlMean | controlStd | controlKSP | ROCAUC | |
|---|---|---|---|---|---|---|
| original_firstorder_Skewness | -3.93e-02 | 5.28e-01 | -6.84e-01 | 6.34e-01 | 0.9782 | 0.780 |
| La_original_glcm_Idmn | 6.35e-01 | 1.44e-03 | 6.33e-01 | 2.85e-03 | 0.4863 | 0.766 |
| original_glcm_ClusterShade | 7.42e+02 | 1.02e+04 | -1.06e+04 | 1.65e+04 | 0.0441 | 0.755 |
| La_original_firstorder_Energy | -6.46e+07 | 1.25e+08 | 6.10e+07 | 1.57e+08 | 0.0136 | 0.742 |
| La_wavelet.HLL_firstorder_InterquartileRange | 1.31e+00 | 1.81e+00 | -4.36e-01 | 2.71e+00 | 0.2171 | 0.742 |
| La_wavelet.LHL_glszm_GrayLevelNonUniformity | 5.38e+00 | 4.68e+00 | 2.30e+00 | 3.15e+00 | 0.4485 | 0.739 |
| wavelet.HLL_glcm_ClusterShade | -7.99e+02 | 3.12e+03 | 2.32e+03 | 4.29e+03 | 0.0472 | 0.737 |
| La_wavelet.HLL_glszm_GrayLevelNonUniformity | 4.79e+00 | 4.40e+00 | 1.24e+00 | 3.81e+00 | 0.9080 | 0.733 |
| La_original_glcm_Contrast | -1.43e+02 | 1.58e+01 | -1.23e+02 | 3.50e+01 | 0.2274 | 0.730 |
| La_wavelet.HLL_glrlm_RunLengthNonUniformity | 4.99e+01 | 3.00e+01 | 2.62e+01 | 3.48e+01 | 0.0971 | 0.730 |
dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")
pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
| mean | total | fraction |
|---|---|---|
| 2.52 | 499 | 0.788 |
theCharformulas <- attr(dc,"LatentCharFormulas")
finalTable <- rbind(finalTable,tableRaw[topvar[!(topvar %in% topLAvar)],univariate_columns])
orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- theCharformulas[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]
Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")
finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
| DecorFormula | caseMean | caseStd | controlMean | controlStd | controlKSP | ROCAUC | RAWAUC | fscores | |
|---|---|---|---|---|---|---|---|---|---|
| original_firstorder_Skewness | NA | -3.93e-02 | 5.28e-01 | -6.84e-01 | 6.34e-01 | 9.78e-01 | 0.780 | 0.780 | 0 |
| original_firstorder_Skewness1 | NA | -3.93e-02 | 5.28e-01 | -6.84e-01 | 6.34e-01 | 9.78e-01 | 0.780 | NA | NA |
| La_original_glcm_Idmn | + original_glcm_Idmn - (0.379)original_glcm_Idn | 6.35e-01 | 1.44e-03 | 6.33e-01 | 2.85e-03 | 4.86e-01 | 0.766 | 0.542 | -1 |
| original_glcm_ClusterShade | NA | 7.42e+02 | 1.02e+04 | -1.06e+04 | 1.65e+04 | 4.41e-02 | 0.755 | 0.755 | 0 |
| original_glcm_ClusterShade1 | NA | 7.42e+02 | 1.02e+04 | -1.06e+04 | 1.65e+04 | 4.41e-02 | 0.755 | NA | NA |
| La_original_firstorder_Energy | + original_firstorder_Energy - (4.34e+05)wavelet.HLL_glrlm_RunLengthNonUniformity | -6.46e+07 | 1.25e+08 | 6.10e+07 | 1.57e+08 | 1.36e-02 | 0.742 | 0.499 | -1 |
| La_wavelet.HLL_firstorder_InterquartileRange | + wavelet.HLL_firstorder_InterquartileRange - (2.272)wavelet.HLL_firstorder_RobustMeanAbsoluteDeviation | 1.31e+00 | 1.81e+00 | -4.36e-01 | 2.71e+00 | 2.17e-01 | 0.742 | 0.543 | -1 |
| La_wavelet.LHL_glszm_GrayLevelNonUniformity | - (0.016)wavelet.HLL_glrlm_RunLengthNonUniformity + wavelet.LHL_glszm_GrayLevelNonUniformity | 5.38e+00 | 4.68e+00 | 2.30e+00 | 3.15e+00 | 4.48e-01 | 0.739 | 0.607 | -1 |
| wavelet.HLL_glcm_ClusterShade | NA | -7.99e+02 | 3.12e+03 | 2.32e+03 | 4.29e+03 | 4.72e-02 | 0.737 | 0.737 | 0 |
| wavelet.HLL_glcm_ClusterShade1 | NA | -7.99e+02 | 3.12e+03 | 2.32e+03 | 4.29e+03 | 4.72e-02 | 0.737 | NA | NA |
| wavelet.HLL_firstorder_Skewness | NA | -1.05e-02 | 5.50e-01 | 5.21e-01 | 8.55e-01 | 5.54e-02 | 0.736 | 0.736 | NA |
| La_wavelet.HLL_glszm_GrayLevelNonUniformity | - (0.016)wavelet.HLL_glrlm_RunLengthNonUniformity + wavelet.HLL_glszm_GrayLevelNonUniformity | 4.79e+00 | 4.40e+00 | 1.24e+00 | 3.81e+00 | 9.08e-01 | 0.733 | 0.613 | -1 |
| original_firstorder_Median | NA | 2.77e+02 | 5.52e+01 | 3.33e+02 | 8.63e+01 | 6.29e-01 | 0.732 | 0.732 | NA |
| La_original_glcm_Contrast | + original_glcm_Contrast - (34.029)original_glcm_DifferenceAverage | -1.43e+02 | 1.58e+01 | -1.23e+02 | 3.50e+01 | 2.27e-01 | 0.730 | 0.527 | 0 |
| La_wavelet.HLL_glrlm_RunLengthNonUniformity | - (0.126)original_shape_MeshVolume + (4.206)wavelet.HLL_glrlm_GrayLevelNonUniformity + wavelet.HLL_glrlm_RunLengthNonUniformity | 4.99e+01 | 3.00e+01 | 2.62e+01 | 3.48e+01 | 9.71e-02 | 0.730 | 0.565 | 4 |
| original_glcm_MaximumProbability | NA | 6.19e-03 | 2.32e-03 | 1.02e-02 | 6.26e-03 | 7.10e-02 | 0.722 | 0.722 | NA |
| original_gldm_LargeDependenceHighGrayLevelEmphasis | NA | 1.36e+04 | 9.87e+03 | 3.19e+04 | 3.44e+04 | 2.54e-03 | 0.711 | 0.711 | 0 |
| original_glcm_JointEnergy | NA | 1.72e-03 | 7.60e-04 | 2.89e-03 | 2.23e-03 | 1.21e-02 | 0.710 | 0.710 | NA |
| original_firstorder_Mean | NA | 2.75e+02 | 5.33e+01 | 3.20e+02 | 7.86e+01 | 3.40e-01 | 0.702 | 0.702 | NA |
| original_glszm_LargeAreaHighGrayLevelEmphasis | NA | 2.34e+04 | 4.63e+04 | 1.77e+05 | 7.31e+05 | 1.50e-08 | 0.700 | 0.700 | NA |
featuresnames <- colnames(dataframe)[colnames(dataframe) != outcome]
pc <- prcomp(dataframe[,iscontinous],center = TRUE,scale. = TRUE) #principal components
predPCA <- predict(pc,dataframe[,iscontinous])
PCAdataframe <- as.data.frame(cbind(predPCA,dataframe[,!iscontinous]))
colnames(PCAdataframe) <- c(colnames(predPCA),colnames(dataframe)[!iscontinous])
#plot(PCAdataframe[,colnames(PCAdataframe)!=outcome],col=dataframe[,outcome],cex=0.65,cex.lab=0.5,cex.axis=0.75,cex.sub=0.5,cex.main=0.75)
#pander::pander(pc$rotation)
PCACor <- cor(PCAdataframe[,colnames(PCAdataframe) != outcome])
gplots::heatmap.2(abs(PCACor),
trace = "none",
# scale = "row",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "PCA Correlation",
cexRow = 0.5,
cexCol = 0.5,
srtCol=45,
srtRow= -45,
key.title=NA,
key.xlab="Pearson Correlation",
xlab="Feature", ylab="Feature")
EFAdataframe <- dataframeScaled
if (length(iscontinous) < 2000)
{
topred <- min(length(iscontinous),nrow(dataframeScaled),ncol(predPCA)/2)
if (topred < 2) topred <- 2
uls <- fa(dataframeScaled[,iscontinous],nfactors=topred,rotate="varimax",warnings=FALSE) # EFA analysis
predEFA <- predict(uls,dataframeScaled[,iscontinous])
EFAdataframe <- as.data.frame(cbind(predEFA,dataframeScaled[,!iscontinous]))
colnames(EFAdataframe) <- c(colnames(predEFA),colnames(dataframeScaled)[!iscontinous])
EFACor <- cor(EFAdataframe[,colnames(EFAdataframe) != outcome])
gplots::heatmap.2(abs(EFACor),
trace = "none",
# scale = "row",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "EFA Correlation",
cexRow = 0.5,
cexCol = 0.5,
srtCol=45,
srtRow= -45,
key.title=NA,
key.xlab="Pearson Correlation",
xlab="Feature", ylab="Feature")
}
par(op)
par(xpd = TRUE)
dataframe[,outcome] <- factor(dataframe[,outcome])
rawmodel <- rpart(paste(outcome,"~."),dataframe,control=rpart.control(maxdepth=3))
pr <- predict(rawmodel,dataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
plot(rawmodel,main="Raw",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
text(rawmodel, use.n = TRUE,cex=0.75)
ptab <- epiR::epi.tests(table(pr==0,dataframe[,outcome]==0))
}
pander::pander(table(dataframe[,outcome],pr))
| 0 | 1 | |
|---|---|---|
| 0 | 46 | 8 |
| 1 | 9 | 87 |
pander::pander(ptab$detail[c(5,3,4,6),])
| statistic | est | lower | upper | |
|---|---|---|---|---|
| 5 | diag.ac | 0.887 | 0.825 | 0.933 |
| 3 | se | 0.906 | 0.829 | 0.956 |
| 4 | sp | 0.852 | 0.729 | 0.934 |
| 6 | diag.or | 55.583 | 20.099 | 153.711 |
par(op)
par(xpd = TRUE)
DEdataframe[,outcome] <- factor(DEdataframe[,outcome])
IDeAmodel <- rpart(paste(outcome,"~."),DEdataframe[,c(outcome,varlistcV)],control=rpart.control(maxdepth=3))
pr <- predict(IDeAmodel,DEdataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
plot(IDeAmodel,main="ILAA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
text(IDeAmodel, use.n = TRUE,cex=0.75)
ptab <- epiR::epi.tests(table(pr==0,DEdataframe[,outcome]==0))
}
pander::pander(table(DEdataframe[,outcome],pr))
| 0 | 1 | |
|---|---|---|
| 0 | 42 | 12 |
| 1 | 7 | 89 |
pander::pander(ptab$detail[c(5,3,4,6),])
| statistic | est | lower | upper | |
|---|---|---|---|---|
| 5 | diag.ac | 0.873 | 0.809 | 0.922 |
| 3 | se | 0.927 | 0.856 | 0.970 |
| 4 | sp | 0.778 | 0.644 | 0.880 |
| 6 | diag.or | 44.500 | 16.342 | 121.177 |
par(op)
par(xpd = TRUE)
PCAdataframe[,outcome] <- factor(PCAdataframe[,outcome])
PCAmodel <- rpart(paste(outcome,"~."),PCAdataframe,control=rpart.control(maxdepth=3))
pr <- predict(PCAmodel,PCAdataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
plot(PCAmodel,main="PCA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
text(PCAmodel, use.n = TRUE,cex=0.75)
ptab <- epiR::epi.tests(table(pr==0,PCAdataframe[,outcome]==0))
}
pander::pander(table(PCAdataframe[,outcome],pr))
| 0 | 1 | |
|---|---|---|
| 0 | 38 | 16 |
| 1 | 14 | 82 |
pander::pander(ptab$detail[c(5,3,4,6),])
| statistic | est | lower | upper | |
|---|---|---|---|---|
| 5 | diag.ac | 0.800 | 0.727 | 0.861 |
| 3 | se | 0.854 | 0.767 | 0.918 |
| 4 | sp | 0.704 | 0.564 | 0.820 |
| 6 | diag.or | 13.911 | 6.164 | 31.392 |
par(op)
EFAdataframe[,outcome] <- factor(EFAdataframe[,outcome])
EFAmodel <- rpart(paste(outcome,"~."),EFAdataframe,control=rpart.control(maxdepth=3))
pr <- predict(EFAmodel,EFAdataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
plot(EFAmodel,main="EFA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
text(EFAmodel, use.n = TRUE,cex=0.75)
ptab <- epiR::epi.tests(table(pr==0,EFAdataframe[,outcome]==0))
}
pander::pander(table(EFAdataframe[,outcome],pr))
| 0 | 1 | |
|---|---|---|
| 0 | 27 | 27 |
| 1 | 3 | 93 |
pander::pander(ptab$detail[c(5,3,4,6),])
| statistic | est | lower | upper | |
|---|---|---|---|---|
| 5 | diag.ac | 0.800 | 0.727 | 0.861 |
| 3 | se | 0.969 | 0.911 | 0.994 |
| 4 | sp | 0.500 | 0.361 | 0.639 |
| 6 | diag.or | 31.000 | 8.728 | 110.102 |
par(op)